This notebook goes through, step by step, how to save a Spark DataFrame schema to a file so that it can be reused later.

Doing this will significantly decrease the amount of time needed to load a JSON file, because Spark no longer has to scan the data to infer the schema.


In [7]:
from pyspark.sql.types import StructType
import json

In [1]:
# Path to the JSON data file; reused below when the data is loaded a second time.
data_path = "wiki_edit_data.json"
# Read in the data without supplying a schema, so the reader has to work one out itself.
wiki_edits = sqlCtx.read.json(data_path)

In [14]:
# Show the column names, types and nullability of the freshly loaded data.
wiki_edits.printSchema()


root
 |-- article_id: long (nullable = true)
 |-- article_namespace: long (nullable = true)
 |-- article_title: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- minor: boolean (nullable = true)
 |-- parent_id: long (nullable = true)
 |-- redirect_target: string (nullable = true)
 |-- revision_id: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- user_name: string (nullable = true)


In [2]:
# The original schema, as a StructType object (this is what we want to persist).
wiki_edits.schema


Out[2]:
StructType(List(StructField(article_id,LongType,true),StructField(article_namespace,LongType,true),StructField(article_title,StringType,true),StructField(comment,StringType,true),StructField(minor,BooleanType,true),StructField(parent_id,LongType,true),StructField(redirect_target,StringType,true),StructField(revision_id,LongType,true),StructField(timestamp,StringType,true),StructField(user_id,LongType,true),StructField(user_name,StringType,true)))

In [3]:
# Serialise the schema to a JSON-formatted string and display it.
# `s` is what gets written to disk further down.
s = wiki_edits.schema.json()
s


Out[3]:
'{"fields":[{"metadata":{},"name":"article_id","nullable":true,"type":"long"},{"metadata":{},"name":"article_namespace","nullable":true,"type":"long"},{"metadata":{},"name":"article_title","nullable":true,"type":"string"},{"metadata":{},"name":"comment","nullable":true,"type":"string"},{"metadata":{},"name":"minor","nullable":true,"type":"boolean"},{"metadata":{},"name":"parent_id","nullable":true,"type":"long"},{"metadata":{},"name":"redirect_target","nullable":true,"type":"string"},{"metadata":{},"name":"revision_id","nullable":true,"type":"long"},{"metadata":{},"name":"timestamp","nullable":true,"type":"string"},{"metadata":{},"name":"user_id","nullable":true,"type":"long"},{"metadata":{},"name":"user_name","nullable":true,"type":"string"}],"type":"struct"}'

In [4]:
# Confirm that schema.json() hands back a plain string, not a dict.
type(wiki_edits.schema.json())


Out[4]:
str

In [5]:
# The schema is now a JSON-formatted string. Passing that string straight to json.dump()
# would write it as one quoted JSON string, so we first parse it with json.loads() and
# dump the resulting dict instead.

In [8]:
# Parse the schema string into a dict, then write that dict out as a JSON document.
with open('wiki_schema.json', 'w') as schema_file:
    json.dump(json.loads(s), schema_file)

In [15]:
# File is saved! Let's make sure it worked by loading it back in :)

In [9]:
# Read the saved schema file back in and parse it into Python objects.
with open('wiki_schema.json') as schema_file:
    json_in = json.load(schema_file)

In [10]:
# What does the reloaded schema look like? (It comes back as a nested dict, not a StructType.)
json_in


Out[10]:
{u'fields': [{u'metadata': {},
   u'name': u'article_id',
   u'nullable': True,
   u'type': u'long'},
  {u'metadata': {},
   u'name': u'article_namespace',
   u'nullable': True,
   u'type': u'long'},
  {u'metadata': {},
   u'name': u'article_title',
   u'nullable': True,
   u'type': u'string'},
  {u'metadata': {},
   u'name': u'comment',
   u'nullable': True,
   u'type': u'string'},
  {u'metadata': {}, u'name': u'minor', u'nullable': True, u'type': u'boolean'},
  {u'metadata': {},
   u'name': u'parent_id',
   u'nullable': True,
   u'type': u'long'},
  {u'metadata': {},
   u'name': u'redirect_target',
   u'nullable': True,
   u'type': u'string'},
  {u'metadata': {},
   u'name': u'revision_id',
   u'nullable': True,
   u'type': u'long'},
  {u'metadata': {},
   u'name': u'timestamp',
   u'nullable': True,
   u'type': u'string'},
  {u'metadata': {}, u'name': u'user_id', u'nullable': True, u'type': u'long'},
  {u'metadata': {},
   u'name': u'user_name',
   u'nullable': True,
   u'type': u'string'}],
 u'type': u'struct'}

In [11]:
# To be usable as a reader schema, the dict has to be turned back into a StructType.
schema_in = StructType.fromJson(json_in)
# Check that we really got a StructType back.
type(schema_in)


Out[11]:
pyspark.sql.types.StructType

In [12]:
# Read in the data again, this time with the schema.
# Reuse schema_in (the StructType rebuilt from the saved schema file) instead of
# calling StructType.fromJson(json_in) a second time.
wiki_edits2 = sqlCtx.read.json(data_path, schema=schema_in)

In [13]:
# Now loading is a whole lot faster! Still, print the schema of the reloaded data so it
# can be compared with the one printed for the original load, to make sure nothing was lost.
wiki_edits2.printSchema()


root
 |-- article_id: long (nullable = true)
 |-- article_namespace: long (nullable = true)
 |-- article_title: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- minor: boolean (nullable = true)
 |-- parent_id: long (nullable = true)
 |-- redirect_target: string (nullable = true)
 |-- revision_id: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- user_name: string (nullable = true)


In [ ]: